site <- "http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv"
AD <- read.csv(site)
head(AD)
## X TV Radio Newspaper Sales
## 1 1 230.1 37.8 69.2 22.1
## 2 2 44.5 39.3 45.1 10.4
## 3 3 17.2 45.9 69.3 9.3
## 4 4 151.5 41.3 58.5 18.5
## 5 5 180.8 10.8 58.4 12.9
## 6 6 8.7 48.9 75.0 7.2
dim(AD)
## [1] 200 5
library(DT)
datatable(AD[, -1], rownames = FALSE,
caption = 'Table 1: This is a simple caption for the table.')
plot(Sales ~ TV, data = AD, col = "red", pch = 19)
mod1 <- lm(Sales ~ TV, data = AD)
abline(mod1, col = "blue")
ggplot2library(ggplot2)
library(MASS)
p <- ggplot(data = AD, aes(x = TV, y = Sales)) +
geom_point(color = "lightblue") +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
geom_smooth(method = "loess", color = "red", se = FALSE) +
geom_smooth(method = "rlm", color = "purple", se = FALSE) +
theme_bw()
p
ggvislibrary(ggvis)
AD %>%
ggvis(x = ~TV, y = ~Sales) %>%
layer_points() %>%
layer_model_predictions(model = "lm", se = FALSE) %>%
layer_model_predictions(model = "MASS::rlm", se = FALSE, stroke := "blue") %>%
layer_smooths(stroke:="red", se = FALSE)
plotlylibrary(plotly)
p2 <- ggplotly(p)
p2
library(car)
scatterplotMatrix(~ Sales + TV + Radio + Newspaper, data = AD)
Recall mod1
mod1 <- lm(Sales ~ TV, data = AD)
summary(mod1)
##
## Call:
## lm(formula = Sales ~ TV, data = AD)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.3860 -1.9545 -0.1913 2.0671 7.2124
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.032594 0.457843 15.36 <2e-16 ***
## TV 0.047537 0.002691 17.67 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.259 on 198 degrees of freedom
## Multiple R-squared: 0.6119, Adjusted R-squared: 0.6099
## F-statistic: 312.1 on 1 and 198 DF, p-value: < 2.2e-16
Residual: \(e_i = y_i - \hat{y_i}\)
To obtain the residuals for mod1 use the function resid on a linear model object.
eis <- resid(mod1)
RSS <- sum(eis^2)
RSS
## [1] 2102.531
RSE <- sqrt(RSS/(dim(AD)[1]-2))
RSE
## [1] 3.258656
# Or
summary(mod1)$sigma
## [1] 3.258656
The least squares estimators of \(\beta_0\) and \(\beta_1\) are
\[b_0 = \hat{\beta_0} = \bar{y} - b_1\bar{x}\] \[b_1 = \hat{\beta_1} = \frac{\sum_{i = 1}^n(x_i - \bar{x})(y_i - \bar{y})}{\sum_{i=1}^n(x_i-\bar{x})^2}\]
y <- AD$Sales
x <- AD$TV
b1 <- sum( (x - mean(x))*(y - mean(y)) ) / sum((x - mean(x))^2)
b0 <- mean(y) - b1*mean(x)
c(b0, b1)
## [1] 7.03259355 0.04753664
# Or using
coef(mod1)
## (Intercept) TV
## 7.03259355 0.04753664
summary(mod1)
##
## Call:
## lm(formula = Sales ~ TV, data = AD)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.3860 -1.9545 -0.1913 2.0671 7.2124
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.032594 0.457843 15.36 <2e-16 ***
## TV 0.047537 0.002691 17.67 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.259 on 198 degrees of freedom
## Multiple R-squared: 0.6119, Adjusted R-squared: 0.6099
## F-statistic: 312.1 on 1 and 198 DF, p-value: < 2.2e-16
XTXI <- summary(mod1)$cov.unscaled
MSE <- summary(mod1)$sigma^2
var.cov.b <- MSE*XTXI
var.cov.b
## (Intercept) TV
## (Intercept) 0.209620158 -1.064495e-03
## TV -0.001064495 7.239367e-06
seb0 <- sqrt(var.cov.b[1, 1])
seb1 <- sqrt(var.cov.b[2, 2])
c(seb0, seb1)
## [1] 0.457842940 0.002690607
coef(summary(mod1))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.03259355 0.457842940 15.36028 1.40630e-35
## TV 0.04753664 0.002690607 17.66763 1.46739e-42
coef(summary(mod1))[1, 2]
## [1] 0.4578429
coef(summary(mod1))[2, 2]
## [1] 0.002690607
tb0 <- b0/seb0
tb1 <- b1/seb1
c(tb0, tb1)
## [1] 15.36028 17.66763
pvalues <- c(pt(tb0, 198, lower = FALSE)*2, pt(tb1, 198, lower = FALSE)*2)
pvalues
## [1] 1.40630e-35 1.46739e-42
coef(summary(mod1))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.03259355 0.457842940 15.36028 1.40630e-35
## TV 0.04753664 0.002690607 17.66763 1.46739e-42
TSS <- sum((y - mean(y))^2)
c(RSS, TSS)
## [1] 2102.531 5417.149
R2 <- (TSS - RSS)/TSS
R2
## [1] 0.6118751
# Or
summary(mod1)$r.squared
## [1] 0.6118751
\[\text{CI}_{1 - \alpha}(\beta_1) = \left[b_1 - t_{1- \alpha/2, n - p + 1}SE(b1), b_1 + t_{1- \alpha/2, n - p + 1}SE(b1) \right]\]
Example: Construct a 90% confidence interval for \(\beta_1\).
alpha <- 0.10
ct <- qt(1 - alpha/2, 198)
ct
## [1] 1.652586
b1 +c(-1, 1)*ct*seb1
## [1] 0.04309018 0.05198310
# Or
confint(mod1, parm = "TV", level = 0.90)
## 5 % 95 %
## TV 0.04309018 0.0519831
confint(mod1)
## 2.5 % 97.5 %
## (Intercept) 6.12971927 7.93546783
## TV 0.04223072 0.05284256
Solution of linear systems Find the solution(s) if any to the following linear equations.
\[2x + y - z = 8\] \[-3x - y + 2z = -11\] \[-2x + y + 2z = -3\]
A <- matrix(c(2, -3, -2, 1, -1, 1, -1, 2, 2), nrow = 3)
b <- matrix(c(8, -11, -3), nrow = 3)
x <- solve(A)%*%b
x
## [,1]
## [1,] 2
## [2,] 3
## [3,] -1
# Or
solve(A, b)
## [,1]
## [1,] 2
## [2,] 3
## [3,] -1
See wikipedia for a review of matrix multiplication rules and properties.
Consider the 2 \(\times\) 2 matrix \(A\).
\[A = \begin{bmatrix} 2 & 4 \\ 9 & 5 \\ \end{bmatrix} \]
A <- matrix(c(2, 9, 4, 5), nrow = 2)
A
## [,1] [,2]
## [1,] 2 4
## [2,] 9 5
t(A) # Transpose of A
## [,1] [,2]
## [1,] 2 9
## [2,] 4 5
t(A)%*%A # A'A
## [,1] [,2]
## [1,] 85 53
## [2,] 53 41
solve(A)%*%A # I_2
## [,1] [,2]
## [1,] 1.000000e+00 0
## [2,] 1.110223e-16 1
zapsmall(solve(A)%*%A) # What you expect I_2
## [,1] [,2]
## [1,] 1 0
## [2,] 0 1
X <- model.matrix(mod1)
XTX <- t(X)%*%X
dim(XTX)
## [1] 2 2
XTXI <- solve(XTX)
XTXI
## (Intercept) TV
## (Intercept) 0.0197403984 -1.002458e-04
## TV -0.0001002458 6.817474e-07
# But it is best to compute this quantity using
summary(mod1)$cov.unscaled
## (Intercept) TV
## (Intercept) 0.0197403984 -1.002458e-04
## TV -0.0001002458 6.817474e-07
betahat <- XTXI%*%t(X)%*%y
betahat
## [,1]
## (Intercept) 7.03259355
## TV 0.04753664
coef(mod1)
## (Intercept) TV
## 7.03259355 0.04753664
XTXI <- summary(mod1)$cov.unscaled
MSE <- summary(mod1)$sigma^2
var.cov.b <- MSE*XTXI
var.cov.b
## (Intercept) TV
## (Intercept) 0.209620158 -1.064495e-03
## TV -0.001064495 7.239367e-06
mod2 <- lm(Sales ~ TV + Radio, data = AD)
summary(mod2)
##
## Call:
## lm(formula = Sales ~ TV + Radio, data = AD)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.7977 -0.8752 0.2422 1.1708 2.8328
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.92110 0.29449 9.919 <2e-16 ***
## TV 0.04575 0.00139 32.909 <2e-16 ***
## Radio 0.18799 0.00804 23.382 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.681 on 197 degrees of freedom
## Multiple R-squared: 0.8972, Adjusted R-squared: 0.8962
## F-statistic: 859.6 on 2 and 197 DF, p-value: < 2.2e-16
mod3 <- lm(Sales ~ TV + Radio + Newspaper, data = AD)
summary(mod3)
##
## Call:
## lm(formula = Sales ~ TV + Radio + Newspaper, data = AD)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.8277 -0.8908 0.2418 1.1893 2.8292
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.938889 0.311908 9.422 <2e-16 ***
## TV 0.045765 0.001395 32.809 <2e-16 ***
## Radio 0.188530 0.008611 21.893 <2e-16 ***
## Newspaper -0.001037 0.005871 -0.177 0.86
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.686 on 196 degrees of freedom
## Multiple R-squared: 0.8972, Adjusted R-squared: 0.8956
## F-statistic: 570.3 on 3 and 196 DF, p-value: < 2.2e-16
\[H_0: \beta_1 = \beta_2 = \beta_3 = 0\] versus the alternative \[H_1: \text{at least one } \beta_j \neq 0\]
The test statistic is \(F = \frac{(\text{TSS} - \text{RSS})/p}{\text{RSS}/(n-p-1)}\)
anova(mod3)
## Analysis of Variance Table
##
## Response: Sales
## Df Sum Sq Mean Sq F value Pr(>F)
## TV 1 3314.6 3314.6 1166.7308 <2e-16 ***
## Radio 1 1545.6 1545.6 544.0501 <2e-16 ***
## Newspaper 1 0.1 0.1 0.0312 0.8599
## Residuals 196 556.8 2.8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
SSR <- sum(anova(mod3)[1:3, 2])
MSR <- SSR/3
SSE <- anova(mod3)[4, 2]
MSE <- SSE/(200-3-1)
Fobs <- MSR/MSE
Fobs
## [1] 570.2707
pvalue <- pf(Fobs, 3, 196, lower = FALSE)
pvalue
## [1] 1.575227e-96
# Or
summary(mod3)
##
## Call:
## lm(formula = Sales ~ TV + Radio + Newspaper, data = AD)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.8277 -0.8908 0.2418 1.1893 2.8292
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.938889 0.311908 9.422 <2e-16 ***
## TV 0.045765 0.001395 32.809 <2e-16 ***
## Radio 0.188530 0.008611 21.893 <2e-16 ***
## Newspaper -0.001037 0.005871 -0.177 0.86
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.686 on 196 degrees of freedom
## Multiple R-squared: 0.8972, Adjusted R-squared: 0.8956
## F-statistic: 570.3 on 3 and 196 DF, p-value: < 2.2e-16
summary(mod3)$fstatistic
## value numdf dendf
## 570.2707 3.0000 196.0000
Suppose we would like to test whether \(\beta_2 = \beta_3 = 0\). The reduced model with \(\beta_2 = \beta_3 = 0\) is mod1 while the full model is mod3.
summary(mod3)
##
## Call:
## lm(formula = Sales ~ TV + Radio + Newspaper, data = AD)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.8277 -0.8908 0.2418 1.1893 2.8292
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.938889 0.311908 9.422 <2e-16 ***
## TV 0.045765 0.001395 32.809 <2e-16 ***
## Radio 0.188530 0.008611 21.893 <2e-16 ***
## Newspaper -0.001037 0.005871 -0.177 0.86
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.686 on 196 degrees of freedom
## Multiple R-squared: 0.8972, Adjusted R-squared: 0.8956
## F-statistic: 570.3 on 3 and 196 DF, p-value: < 2.2e-16
anova(mod1, mod3)
## Analysis of Variance Table
##
## Model 1: Sales ~ TV
## Model 2: Sales ~ TV + Radio + Newspaper
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 198 2102.53
## 2 196 556.83 2 1545.7 272.04 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.fs <- lm(Sales ~ 1, data = AD)
SCOPE <- (~. + TV + Radio + Newspaper)
add1(mod.fs, scope = SCOPE, test = "F")
## Single term additions
##
## Model:
## Sales ~ 1
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 5417.1 661.80
## TV 1 3314.6 2102.5 474.52 312.145 < 2.2e-16 ***
## Radio 1 1798.7 3618.5 583.10 98.422 < 2.2e-16 ***
## Newspaper 1 282.3 5134.8 653.10 10.887 0.001148 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.fs <- update(mod.fs, .~. + TV)
add1(mod.fs, scope = SCOPE, test = "F")
## Single term additions
##
## Model:
## Sales ~ TV
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 2102.53 474.52
## Radio 1 1545.62 556.91 210.82 546.74 < 2.2e-16 ***
## Newspaper 1 183.97 1918.56 458.20 18.89 2.217e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.fs <- update(mod.fs, .~. + Radio)
add1(mod.fs, scope = SCOPE, test = "F")
## Single term additions
##
## Model:
## Sales ~ TV + Radio
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 556.91 210.82
## Newspaper 1 0.088717 556.83 212.79 0.0312 0.8599
summary(mod.fs)
##
## Call:
## lm(formula = Sales ~ TV + Radio, data = AD)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.7977 -0.8752 0.2422 1.1708 2.8328
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.92110 0.29449 9.919 <2e-16 ***
## TV 0.04575 0.00139 32.909 <2e-16 ***
## Radio 0.18799 0.00804 23.382 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.681 on 197 degrees of freedom
## Multiple R-squared: 0.8972, Adjusted R-squared: 0.8962
## F-statistic: 859.6 on 2 and 197 DF, p-value: < 2.2e-16
stepAICstepAIC(lm(Sales ~ 1, data = AD), scope = .~TV + Radio + Newspaper, direction = "forward", test = "F")
## Start: AIC=661.8
## Sales ~ 1
##
## Df Sum of Sq RSS AIC F Value Pr(F)
## + TV 1 3314.6 2102.5 474.52 312.145 < 2.2e-16 ***
## + Radio 1 1798.7 3618.5 583.10 98.422 < 2.2e-16 ***
## + Newspaper 1 282.3 5134.8 653.10 10.887 0.001148 **
## <none> 5417.1 661.80
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=474.52
## Sales ~ TV
##
## Df Sum of Sq RSS AIC F Value Pr(F)
## + Radio 1 1545.62 556.91 210.82 546.74 < 2.2e-16 ***
## + Newspaper 1 183.97 1918.56 458.20 18.89 2.217e-05 ***
## <none> 2102.53 474.52
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=210.82
## Sales ~ TV + Radio
##
## Df Sum of Sq RSS AIC F Value Pr(F)
## <none> 556.91 210.82
## + Newspaper 1 0.088717 556.83 212.79 0.031228 0.8599
##
## Call:
## lm(formula = Sales ~ TV + Radio, data = AD)
##
## Coefficients:
## (Intercept) TV Radio
## 2.92110 0.04575 0.18799
mod.be <- lm(Sales ~ TV + Radio + Newspaper, data = AD)
drop1(mod.be, test = "F")
## Single term deletions
##
## Model:
## Sales ~ TV + Radio + Newspaper
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 556.8 212.79
## TV 1 3058.01 3614.8 584.90 1076.4058 <2e-16 ***
## Radio 1 1361.74 1918.6 458.20 479.3252 <2e-16 ***
## Newspaper 1 0.09 556.9 210.82 0.0312 0.8599
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.be <- update(mod.be, .~. - Newspaper)
drop1(mod.be, test = "F")
## Single term deletions
##
## Model:
## Sales ~ TV + Radio
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 556.9 210.82
## TV 1 3061.6 3618.5 583.10 1082.98 < 2.2e-16 ***
## Radio 1 1545.6 2102.5 474.52 546.74 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(mod.be)
##
## Call:
## lm(formula = Sales ~ TV + Radio, data = AD)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.7977 -0.8752 0.2422 1.1708 2.8328
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.92110 0.29449 9.919 <2e-16 ***
## TV 0.04575 0.00139 32.909 <2e-16 ***
## Radio 0.18799 0.00804 23.382 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.681 on 197 degrees of freedom
## Multiple R-squared: 0.8972, Adjusted R-squared: 0.8962
## F-statistic: 859.6 on 2 and 197 DF, p-value: < 2.2e-16
stepAICstepAIC(lm(Sales ~ TV + Radio + Newspaper, data = AD), scope = .~TV + Radio + Newspaper, direction = "backward", test = "F")
## Start: AIC=212.79
## Sales ~ TV + Radio + Newspaper
##
## Df Sum of Sq RSS AIC F Value Pr(F)
## - Newspaper 1 0.09 556.9 210.82 0.03 0.8599
## <none> 556.8 212.79
## - Radio 1 1361.74 1918.6 458.20 479.33 <2e-16 ***
## - TV 1 3058.01 3614.8 584.90 1076.41 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=210.82
## Sales ~ TV + Radio
##
## Df Sum of Sq RSS AIC F Value Pr(F)
## <none> 556.9 210.82
## - Radio 1 1545.6 2102.5 474.52 546.74 < 2.2e-16 ***
## - TV 1 3061.6 3618.5 583.10 1082.98 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Call:
## lm(formula = Sales ~ TV + Radio, data = AD)
##
## Coefficients:
## (Intercept) TV Radio
## 2.92110 0.04575 0.18799
residualPlots(mod2)
## Test stat Pr(>|t|)
## TV -6.775 0.000
## Radio 1.054 0.293
## Tukey test 7.635 0.000
qqPlot(mod2)
influenceIndexPlot(mod2)
We use a confidence interval to quantify the uncertainty surrounding the average Sales over a large number of cities. For example, given that $100,000 is spent on TV advertising and $20,000 is spent on Radio advertising in each city, the 95% confidence interval is [10.9852544, 11.5276775]. We interpret this to mean that 95% of intervals of this form will contain the true value of Sales.
predict(mod.be, newdata = data.frame(TV = 100, Radio = 20), interval = "conf")
## fit lwr upr
## 1 11.25647 10.98525 11.52768
On the other hand, a prediction interval can be used to quantify the uncertainty surrounding Sales for a particular city. Given that $100,000 is spent on TV advertising and $20,000 is spent on Radio advertising in a particular city, the 95% prediction interval is [7.9296161, 14.5833158]. We interpret this to mean that 95% of intervals of this form will contain the true value of Sales for this city.
predict(mod.be, newdata = data.frame(TV = 100, Radio = 20), interval = "pred")
## fit lwr upr
## 1 11.25647 7.929616 14.58332
Note that both the intervals are centered at 11.256466, but that the prediction interval is substantially wider than the confidence interval, reflecting the increased uncertainty about Sales for a given city in comparison to the average Sales over many locations.
nam1 <- lm(Sales ~ TV*Radio, data = AD)
# Same as
nam2 <- lm(Sales ~ TV + Radio + TV:Radio, data = AD)
summary(nam1)
##
## Call:
## lm(formula = Sales ~ TV * Radio, data = AD)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.3366 -0.4028 0.1831 0.5948 1.5246
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.750e+00 2.479e-01 27.233 <2e-16 ***
## TV 1.910e-02 1.504e-03 12.699 <2e-16 ***
## Radio 2.886e-02 8.905e-03 3.241 0.0014 **
## TV:Radio 1.086e-03 5.242e-05 20.727 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9435 on 196 degrees of freedom
## Multiple R-squared: 0.9678, Adjusted R-squared: 0.9673
## F-statistic: 1963 on 3 and 196 DF, p-value: < 2.2e-16
summary(nam2)
##
## Call:
## lm(formula = Sales ~ TV + Radio + TV:Radio, data = AD)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.3366 -0.4028 0.1831 0.5948 1.5246
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.750e+00 2.479e-01 27.233 <2e-16 ***
## TV 1.910e-02 1.504e-03 12.699 <2e-16 ***
## Radio 2.886e-02 8.905e-03 3.241 0.0014 **
## TV:Radio 1.086e-03 5.242e-05 20.727 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9435 on 196 degrees of freedom
## Multiple R-squared: 0.9678, Adjusted R-squared: 0.9673
## F-statistic: 1963 on 3 and 196 DF, p-value: < 2.2e-16
Hierarchical Principle: If an interaction term is included in a model, one should also include the main effects, even if the p-values associated with their coefficients are not significant.
In the Credit data frame there are four qualitative features/variables Gender, Student, Married, and Ethnicity.
Credit <- read.csv("http://www-bcf.usc.edu/~gareth/ISL/Credit.csv")
datatable(Credit[, -1], rownames = FALSE)
modP <- lm(Balance ~ Income*Student, data = Credit)
summary(modP)
##
## Call:
## lm(formula = Balance ~ Income * Student, data = Credit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -773.39 -325.70 -41.13 321.65 814.04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 200.6232 33.6984 5.953 5.79e-09 ***
## Income 6.2182 0.5921 10.502 < 2e-16 ***
## StudentYes 476.6758 104.3512 4.568 6.59e-06 ***
## Income:StudentYes -1.9992 1.7313 -1.155 0.249
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 391.6 on 396 degrees of freedom
## Multiple R-squared: 0.2799, Adjusted R-squared: 0.2744
## F-statistic: 51.3 on 3 and 396 DF, p-value: < 2.2e-16
Fitted Model: \(\widehat{\text{Balance}} = 200.6231529 + 6.2181687\cdot \text{Income} + 476.6758432\cdot \text{Student} + -1.9991509\cdot\text{Income}\times\text{Student}\)
Suppose we wish to investigate differences in credit card balance between males and females, ignoring the other variables for the moment.
modS <- lm(Balance ~ Gender, data = Credit)
summary(modS)
##
## Call:
## lm(formula = Balance ~ Gender, data = Credit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -529.54 -455.35 -60.17 334.71 1489.20
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 529.54 31.99 16.554 <2e-16 ***
## Gender Male -19.73 46.05 -0.429 0.669
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 460.2 on 398 degrees of freedom
## Multiple R-squared: 0.0004611, Adjusted R-squared: -0.00205
## F-statistic: 0.1836 on 1 and 398 DF, p-value: 0.6685
coef(modS)
## (Intercept) Gender Male
## 529.53623 -19.73312
tapply(Credit$Balance, Credit$Gender, mean)
## Female Male
## 529.5362 509.8031
library(ggplot2)
ggplot(data = Credit, aes(x = Gender, y = Balance)) +
geom_point() +
theme_bw() +
geom_hline(yintercept = coef(modS)[1] + coef(modS)[2], color = "blue") +
geom_hline(yintercept = coef(modS)[1], color = "pink")
modS1 <- lm(Balance ~ Limit + Student, data = Credit)
summary(modS1)
##
## Call:
## lm(formula = Balance ~ Limit + Student, data = Credit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -637.77 -116.90 6.04 130.92 434.24
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.347e+02 2.307e+01 -14.51 <2e-16 ***
## Limit 1.720e-01 4.331e-03 39.70 <2e-16 ***
## StudentYes 4.044e+02 3.328e+01 12.15 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 199.7 on 397 degrees of freedom
## Multiple R-squared: 0.8123, Adjusted R-squared: 0.8114
## F-statistic: 859.2 on 2 and 397 DF, p-value: < 2.2e-16
coef(modS1)
## (Intercept) Limit StudentYes
## -334.7299372 0.1719538 404.4036438
# Interaction --- Non-additive Model
modS2 <- lm(Balance ~ Limit*Student, data = Credit)
summary(modS2)
##
## Call:
## lm(formula = Balance ~ Limit * Student, data = Credit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -705.84 -116.90 6.91 133.97 435.92
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.262e+02 2.392e+01 -13.636 < 2e-16 ***
## Limit 1.702e-01 4.533e-03 37.538 < 2e-16 ***
## StudentYes 3.091e+02 7.878e+01 3.924 0.000103 ***
## Limit:StudentYes 2.028e-02 1.520e-02 1.334 0.183010
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 199.5 on 396 degrees of freedom
## Multiple R-squared: 0.8132, Adjusted R-squared: 0.8118
## F-statistic: 574.5 on 3 and 396 DF, p-value: < 2.2e-16
Several points:
ggplot2 graphing below?ggplot(data = Credit, aes(x = Limit, y = Balance, color = Student)) +
geom_point() +
stat_smooth(method = "lm")
ggplot(data = Credit, aes(x = Limit, y = Balance, color = Student)) +
geom_point() +
theme_bw() +
geom_abline(intercept = coef(modS1)[1], slope = coef(modS1)[2], color = "red") +
geom_abline(intercept = coef(modS1)[1] + coef(modS1)[3], slope = coef(modS1)[2], color = "blue")
modQ3 <- lm(Balance ~ Limit + Ethnicity, data = Credit)
summary(modQ3)
##
## Call:
## lm(formula = Balance ~ Limit + Ethnicity, data = Credit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -677.39 -145.75 -8.75 139.56 776.46
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.078e+02 3.417e+01 -9.007 <2e-16 ***
## Limit 1.718e-01 5.079e-03 33.831 <2e-16 ***
## EthnicityAsian 2.835e+01 3.304e+01 0.858 0.391
## EthnicityCaucasian 1.381e+01 2.878e+01 0.480 0.632
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 234 on 396 degrees of freedom
## Multiple R-squared: 0.743, Adjusted R-squared: 0.7411
## F-statistic: 381.6 on 3 and 396 DF, p-value: < 2.2e-16
coef(modQ3)
## (Intercept) Limit EthnicityAsian
## -307.7574777 0.1718203 28.3533975
## EthnicityCaucasian
## 13.8089629
modRM <- lm(Balance ~ Limit, data = Credit)
anova(modRM, modQ3)
## Analysis of Variance Table
##
## Model 1: Balance ~ Limit
## Model 2: Balance ~ Limit + Ethnicity
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 398 21715657
## 2 396 21675307 2 40350 0.3686 0.6919
What follows fits three separate regression lines based on Ethnicity.
AfAmer <- lm(Balance ~ Limit, data = subset(Credit, Ethnicity == "African American"))
AsAmer <- lm(Balance ~ Limit, data = subset(Credit, Ethnicity == "Asian"))
CaAmer <- lm(Balance ~ Limit, data = subset(Credit, Ethnicity == "Caucasian"))
rbind(coef(AfAmer), coef(AsAmer), coef(CaAmer))
## (Intercept) Limit
## [1,] -301.2245 0.1704820
## [2,] -305.4270 0.1774679
## [3,] -282.4442 0.1693873
ggplot(data = Credit, aes(x = Limit, y = Balance, color = Ethnicity)) +
geom_point() +
theme_bw() +
stat_smooth(method = "lm", se = FALSE)
Note: Ethnicity is not significant, so we really should have just one line.
ggplot(data = Credit, aes(x = Limit, y = Balance)) +
geom_point(aes(color = Ethnicity)) +
theme_bw() +
stat_smooth(method = "lm")
scatterplotMatrix(~ Balance + Income + Limit + Rating + Cards + Age + Education + Gender + Student + Married + Ethnicity, data = Credit)
modC <- stepAIC(lm(Balance ~ Income + Limit + Rating + Cards + Age + Education + Gender + Student + Married + Ethnicity, data = Credit), direction = "backward", test = "F")
## Start: AIC=3686.22
## Balance ~ Income + Limit + Rating + Cards + Age + Education +
## Gender + Student + Married + Ethnicity
##
## Df Sum of Sq RSS AIC F Value Pr(F)
## - Ethnicity 2 14084 3800814 3683.7 0.72 0.48665
## - Education 1 4615 3791345 3684.7 0.47 0.49207
## - Married 1 6619 3793349 3684.9 0.68 0.41073
## - Gender 1 11269 3798000 3685.4 1.15 0.28324
## <none> 3786730 3686.2
## - Age 1 42558 3829288 3688.7 4.36 0.03743 *
## - Rating 1 52314 3839044 3689.7 5.36 0.02112 *
## - Cards 1 162702 3949432 3701.0 16.67 5.401e-05 ***
## - Limit 1 331050 4117780 3717.7 33.92 1.206e-08 ***
## - Student 1 6326012 10112742 4077.1 648.18 < 2.2e-16 ***
## - Income 1 10831162 14617892 4224.5 1109.79 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=3683.7
## Balance ~ Income + Limit + Rating + Cards + Age + Education +
## Gender + Student + Married
##
## Df Sum of Sq RSS AIC F Value Pr(F)
## - Married 1 4545 3805359 3682.2 0.47 0.49506
## - Education 1 4757 3805572 3682.2 0.49 0.48517
## - Gender 1 10760 3811574 3682.8 1.10 0.29403
## <none> 3800814 3683.7
## - Age 1 45650 3846464 3686.5 4.68 0.03105 *
## - Rating 1 49473 3850287 3686.9 5.08 0.02481 *
## - Cards 1 166806 3967621 3698.9 17.12 4.310e-05 ***
## - Limit 1 340250 4141064 3716.0 34.91 7.521e-09 ***
## - Student 1 6372573 10173387 4075.5 653.89 < 2.2e-16 ***
## - Income 1 10838891 14639705 4221.1 1112.17 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=3682.18
## Balance ~ Income + Limit + Rating + Cards + Age + Education +
## Gender + Student
##
## Df Sum of Sq RSS AIC F Value Pr(F)
## - Education 1 5399 3810759 3680.7 0.55 0.45682
## - Gender 1 11019 3816378 3681.3 1.13 0.28797
## <none> 3805359 3682.2
## - Age 1 43545 3848904 3684.7 4.47 0.03504 *
## - Rating 1 46929 3852289 3685.1 4.82 0.02869 *
## - Cards 1 170729 3976088 3697.7 17.54 3.475e-05 ***
## - Limit 1 352112 4157472 3715.6 36.18 4.137e-09 ***
## - Student 1 6461978 10267338 4077.2 663.97 < 2.2e-16 ***
## - Income 1 10860901 14666261 4219.8 1115.96 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=3680.75
## Balance ~ Income + Limit + Rating + Cards + Age + Gender + Student
##
## Df Sum of Sq RSS AIC F Value Pr(F)
## - Gender 1 10861 3821620 3679.9 1.12 0.29117
## <none> 3810759 3680.7
## - Age 1 43983 3854741 3683.3 4.52 0.03404 *
## - Rating 1 49520 3860279 3683.9 5.09 0.02456 *
## - Cards 1 170898 3981656 3696.3 17.58 3.409e-05 ***
## - Limit 1 347654 4158413 3713.7 35.76 5.023e-09 ***
## - Student 1 6472072 10282831 4075.8 665.76 < 2.2e-16 ***
## - Income 1 10855780 14666539 4217.8 1116.70 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=3679.89
## Balance ~ Income + Limit + Rating + Cards + Age + Student
##
## Df Sum of Sq RSS AIC F Value Pr(F)
## <none> 3821620 3679.9
## - Age 1 44472 3866091 3682.5 4.57 0.03309 *
## - Rating 1 49263 3870883 3683.0 5.07 0.02495 *
## - Cards 1 172930 3994549 3695.6 17.78 3.075e-05 ***
## - Limit 1 347898 4169518 3712.7 35.78 4.980e-09 ***
## - Student 1 6462599 10284218 4073.9 664.59 < 2.2e-16 ***
## - Income 1 10845018 14666638 4215.8 1115.26 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modC
##
## Call:
## lm(formula = Balance ~ Income + Limit + Rating + Cards + Age +
## Student, data = Credit)
##
## Coefficients:
## (Intercept) Income Limit Rating Cards
## -493.7342 -7.7951 0.1937 1.0912 18.2119
## Age StudentYes
## -0.6241 425.6099
modD <- stepAIC(lm(Balance ~ 1, data = Credit), scope = ~ . + Income + Limit + Cards + Age + Education + Gender + Student + Married + Ethnicity, direction = "forward", test = "F")
## Start: AIC=4905.56
## Balance ~ 1
##
## Df Sum of Sq RSS AIC F Value Pr(F)
## + Limit 1 62624255 21715657 4364.8 1147.76 < 2.2e-16 ***
## + Income 1 18131167 66208745 4810.7 108.99 < 2.2e-16 ***
## + Student 1 5658372 78681540 4879.8 28.62 1.488e-07 ***
## + Cards 1 630416 83709496 4904.6 3.00 0.08418 .
## <none> 84339912 4905.6
## + Gender 1 38892 84301020 4907.4 0.18 0.66852
## + Education 1 5481 84334431 4907.5 0.03 0.87231
## + Married 1 2715 84337197 4907.5 0.01 0.90994
## + Age 1 284 84339628 4907.6 0.00 0.97081
## + Ethnicity 2 18454 84321458 4909.5 0.04 0.95749
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=4364.83
## Balance ~ Limit
##
## Df Sum of Sq RSS AIC F Value Pr(F)
## + Income 1 10844825 10870832 4090.1 396.05 < 2.2e-16 ***
## + Student 1 5887310 15828347 4240.3 147.66 < 2.2e-16 ***
## + Age 1 617067 21098589 4355.3 11.61 0.0007226 ***
## + Cards 1 508452 21207205 4357.4 9.52 0.0021768 **
## <none> 21715657 4364.8
## + Married 1 89278 21626379 4365.2 1.64 0.2012253
## + Gender 1 15093 21700563 4366.6 0.28 0.5995469
## + Education 1 12622 21703034 4366.6 0.23 0.6311289
## + Ethnicity 2 40350 21675307 4368.1 0.37 0.6919457
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=4090.05
## Balance ~ Limit + Income
##
## Df Sum of Sq RSS AIC F Value Pr(F)
## + Student 1 6553835 4316997 3722.6 601.19 < 2.2e-16 ***
## + Cards 1 326363 10544469 4079.9 12.26 0.0005164 ***
## + Age 1 73681 10797151 4089.3 2.70 0.1009938
## + Married 1 57405 10813427 4089.9 2.10 0.1478749
## <none> 10870832 4090.1
## + Education 1 4042 10866790 4091.9 0.15 0.7013505
## + Gender 1 614 10870218 4092.0 0.02 0.8811968
## + Ethnicity 2 46207 10824625 4092.3 0.84 0.4311630
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=3722.64
## Balance ~ Limit + Income + Student
##
## Df Sum of Sq RSS AIC F Value Pr(F)
## + Cards 1 401938 3915058 3685.6 40.553 5.324e-10 ***
## + Age 1 32050 4284947 3721.7 2.954 0.08643 .
## <none> 4316997 3722.6
## + Education 1 15047 4301949 3723.2 1.382 0.24053
## + Gender 1 14324 4302673 3723.3 1.315 0.25219
## + Married 1 1682 4315315 3724.5 0.154 0.69500
## + Ethnicity 2 12831 4304166 3725.5 0.587 0.55633
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=3685.55
## Balance ~ Limit + Income + Student + Cards
##
## Df Sum of Sq RSS AIC F Value Pr(F)
## + Age 1 44176 3870883 3683.0 4.4965 0.03459 *
## <none> 3915058 3685.6
## + Gender 1 11086 3903972 3686.4 1.1188 0.29082
## + Education 1 8304 3906754 3686.7 0.8375 0.36067
## + Married 1 1151 3913908 3687.4 0.1158 0.73378
## + Ethnicity 2 12534 3902524 3688.3 0.6311 0.53253
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=3683.01
## Balance ~ Limit + Income + Student + Cards + Age
##
## Df Sum of Sq RSS AIC F Value Pr(F)
## <none> 3870883 3683.0
## + Gender 1 10603.7 3860279 3683.9 1.07952 0.2994
## + Education 1 7793.4 3863089 3684.2 0.79284 0.3738
## + Married 1 2661.7 3868221 3684.7 0.27042 0.6033
## + Ethnicity 2 9638.6 3861244 3686.0 0.48926 0.6135
modD
##
## Call:
## lm(formula = Balance ~ Limit + Income + Student + Cards + Age,
## data = Credit)
##
## Coefficients:
## (Intercept) Limit Income StudentYes Cards
## -467.3345 0.2661 -7.7595 428.3786 23.5504
## Age
## -0.6220
# Predict
predict(modC, newdata = data.frame(Income = 80, Limit = 5000, Cards = 3, Age = 52, Student = "No", Rating = 800), interval = "pred")
## fit lwr upr
## 1 746.251 295.5691 1196.933
residualPlots(modC)
## Test stat Pr(>|t|)
## Income 2.442 0.015
## Limit 11.709 0.000
## Rating 11.214 0.000
## Cards -0.692 0.489
## Age -0.159 0.874
## Student NA NA
## Tukey test 25.379 0.000
qqPlot(modC)
influenceIndexPlot(modC)
library(ISLR)
car1 <- lm(mpg ~ horsepower, data = Auto)
car2 <- lm(mpg ~ poly(horsepower, 2), data = Auto)
car5 <- lm(mpg ~ poly(horsepower, 5), data = Auto)
xs <- seq(min(Auto$horsepower), max(Auto$horsepower), length = 500)
y1 <- predict(car1, newdata = data.frame(horsepower = xs))
y2 <- predict(car2, newdata = data.frame(horsepower = xs))
y5 <- predict(car5, newdata = data.frame(horsepower = xs))
DF <- data.frame(x = xs, y1 = y1, y2 = y2, y5 = y5)
ggplot(data = Auto, aes(x = horsepower, y = mpg)) +
geom_point() +
theme_bw() +
geom_line(data = DF, aes(x = x, y = y1), color = "red") +
geom_line(data = DF, aes(x = x, y = y2), color = "blue") +
geom_line(data = DF, aes(x = x, y = y5), color = "green")
ggplot(data = Auto, aes(x = horsepower, y = mpg)) +
geom_point(color = "lightblue") +
theme_bw() +
stat_smooth(method = "lm", data = Auto, color = "red", se = FALSE) +
stat_smooth(method = "lm", formula = y ~ poly(x, 2), data = Auto, color = "blue", se = FALSE) +
stat_smooth(method = "lm", formula = y ~ poly(x, 5), data = Auto, color = "green", se = FALSE)
newC <- update(modC, .~. - Limit - Income - Rating + poly(Income, 2) + poly(Limit, 4))
summary(newC)
##
## Call:
## lm(formula = Balance ~ Cards + Age + Student + poly(Income, 2) +
## poly(Limit, 4), data = Credit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -365.08 -32.62 -4.70 32.36 219.66
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 458.4208 13.2687 34.549 < 2e-16 ***
## Cards 22.8068 2.4225 9.415 < 2e-16 ***
## Age -0.8707 0.1960 -4.441 1.17e-05 ***
## StudentYes 426.1013 11.1190 38.322 < 2e-16 ***
## poly(Income, 2)1 -6641.4002 136.3325 -48.715 < 2e-16 ***
## poly(Income, 2)2 -290.5402 91.9410 -3.160 0.0017 **
## poly(Limit, 4)1 13214.2696 126.2891 104.635 < 2e-16 ***
## poly(Limit, 4)2 1401.5188 100.4033 13.959 < 2e-16 ***
## poly(Limit, 4)3 -860.8780 70.5149 -12.208 < 2e-16 ***
## poly(Limit, 4)4 488.6575 67.9057 7.196 3.20e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 66.1 on 390 degrees of freedom
## Multiple R-squared: 0.9798, Adjusted R-squared: 0.9793
## F-statistic: 2101 on 9 and 390 DF, p-value: < 2.2e-16
residualPlots(newC)
## Test stat Pr(>|t|)
## Cards 0.177 0.860
## Age -0.469 0.639
## Student NA NA
## poly(Income, 2) NA NA
## poly(Limit, 4) NA NA
## Tukey test 13.439 0.000
qqPlot(newC)
influenceIndexPlot(newC)